
import pandas as pd
from _funcs import parallel_read_csv, get_chunks, read_csv_chunk

# Root directory of the PatentsView data; every input/output path in this
# script is built by string-appending a relative path to it (hence the
# trailing slash).
# NOTE(review): this name shadows the builtin `dir()`; it is kept unchanged
# because noselfcite_k() reads it as a module-level global.
dir = r'/Volumes/Zihao_SSD2/PatentsView/'

## ==================================================================================================================
## Generate omission panel without self-citation
## Zihao Li. 06/2024

## Inputs:  temp/actual_citation_lst.csv
##          cleandata/sim_score_1981_2015_top5_noselfcite.csv

## Output:  temp/omission_panel5_noselfcite.csv
## ==================================================================================================================


def _omission_flags(df):
    """Return the 0/1 omission flag for every row of *df*, in row order.

    A row is flagged 1 when its ``no_overlap_inventors`` value equals the
    string ``"False"`` or when ``str(cited_patent_id)`` is not contained in
    its ``actual_citation_list``; otherwise it is flagged 0.

    The columns are walked in lockstep instead of using
    ``DataFrame.apply(axis=1)``, which materialises a Series per row and is
    prohibitively slow on tens of millions of rows. An empty frame simply
    yields an empty list.
    """
    # NOTE(review): the comparison is against the *string* "False". If the
    # column was parsed as booleans it can never match -- confirm the dtype.
    # NOTE(review): if actual_citation_list holds strings rather than lists,
    # `in` is a substring test, so a shorter id can match inside a longer
    # one -- confirm what parallel_read_csv returns for this column.
    return [
        1 if overlap_flag == "False" or str(cited_id) not in cited_list else 0
        for overlap_flag, cited_id, cited_list in zip(
            df['no_overlap_inventors'],
            df['cited_patent_id'],
            df['actual_citation_list'],
        )
    ]


def noselfcite_k(actual_citation_file):
    """Build the omission panel (no self-citation) and export it as CSV.

    Reads ``cleandata/sim_score_1981_2015_top5_noselfcite.csv``, left-merges
    the actual-citation table onto it by ``patent_id``, drops rows that have
    no ``actual_citation_list``, adds an ``omission`` 0/1 column and writes
    the result to ``temp/omission_panel5_noselfcite.csv``. All paths are
    relative to the module-level ``dir``.

    Args:
        actual_citation_file: Path (relative to ``dir``) of the CSV handed
            to ``parallel_read_csv``. The loaded table must have a
            ``patent_id`` column (merge key) and an
            ``actual_citation_list`` column.

    Returns:
        None. The only result is the CSV written to disk.
    """
    ## Load citation data
    print('Loading citation data...')
    df_actual_lst = parallel_read_csv(dir + actual_citation_file)
    df = pd.read_csv(dir + 'cleandata/sim_score_1981_2015_top5_noselfcite.csv', low_memory=False)
    df = df.drop(columns=['patent_idx', 'cited_patent_idx', 'patent_year', 'cited_patent_year'])
    # Per citing patent: highest similarity first, ties broken by larger cited id.
    df = df.sort_values(by=['patent_id', 'sim_score', 'cited_patent_id'], ascending=[True, False, False])

    # Merging actual_citation_list with artificial citation data
    print('    Merging actual_citation_list with artificial citation data...')
    df = df.merge(df_actual_lst, how='left', on='patent_id')
    # The right-hand table is not used again; drop the reference so its
    # memory can be reclaimed before the row scan below.
    del df_actual_lst
    print('    Shape of merged dataset is {}.'.format(df.shape)) # (24757829, 9)

    # Generating omission index (excluding self-citation)
    print('    Generating omission index...')
    # Rows whose patent_id had no match in the actual-citation table carry
    # NaN after the left merge and cannot be tested for membership.
    df = df.dropna(subset=['actual_citation_list'])
    df['omission'] = _omission_flags(df)
    print('Shape of omission dataset is {}.'.format(df.shape)) # (23745773, 10)

    # Exporting omission panel
    print('Exporting omission panel...')
    df.to_csv(dir + 'temp/omission_panel5_noselfcite.csv', index=False)


def main():
    """Run the omission-panel build on the default actual-citation list file."""
    noselfcite_k(actual_citation_file='temp/actual_citation_lst.csv')


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()